#Checking the Perspective API reliability
#Alan Yan
#3-16-2020

#environment
#NOTE: the former rm(list = ls()) call was removed. It only deletes
#user-created objects (attached packages and options are left untouched) and
#it wipes the workspace of anyone who source()s this file. Every object used
#below is assigned in this script, so nothing depends on the wipe. Restart R
#for a genuinely fresh session.

#set working directory
#the data paths below are relative, so they are resolved against the current
#working directory
#setwd(dir = "c:/Users/Alan Yan/Dropbox/NGA/Gender Name Testing/Clean Folder/04-Perspective-API/01-code")

#load library
library(pacman)
p_load(tidyverse)

#load data
#tox_scores: Perspective API output; exp1 / exp2: cleaned experiment data
tox_scores <- read.csv("04-Perspective-API/data/02-clean-data/api_coded_texts.csv", header = TRUE, stringsAsFactors = FALSE)
exp1 <- read.csv("01-Experiment-1/data/04-clean-data/clean_data.csv", header = TRUE, stringsAsFactors = FALSE)
exp2 <- read.csv("02-Experiment-2/data/04-clean-data/clean_data.csv", header = TRUE, stringsAsFactors = FALSE)

#attach the toxicity scores to each experiment; a left join keeps every
#experiment row, and rows without a match get NA in the tox_scores columns
exp1 <- exp1 %>%
  left_join(tox_scores, by = c("conversation_id" = "id"))
exp2 <- exp2 %>%
  left_join(tox_scores, by = c("message_id" = "id"))

#rows with a missing (NA) tox score are imputed as 0
exp1$tox_score[is.na(exp1$tox_score)] <- 0
exp2$tox_score[is.na(exp2$tox_score)] <- 0

####First study
#opted_out is needed both as a condition (originally compared with the string
#"true") and as a numeric input to cor(). Those two uses cannot both work on
#the raw column: read.csv turns a pure true/false column into logical (and
#TRUE == "true" is FALSE), while a character column makes cor() fail.
#Normalise it once to a 0/1 vector that is valid for either parse.
exp1_opted_out <- as.numeric(tolower(as.character(exp1$opted_out)) == "true")

#pure silencing: opted out and Offensive == 100
exp1$pure.silencing <- ifelse(exp1_opted_out == 1 & exp1$Offensive == 100, 1, 0)
#withdrawal: opted out and Offensive == 0
exp1$withdrawal <- ifelse(exp1_opted_out == 1 & exp1$Offensive == 0, 1, 0)

#correlation between offensiveness score and tox score
cor(exp1$tox_score, exp1$Offensive)
#correlation between opting out and tox score
cor(exp1$tox_score, exp1_opted_out)
#correlation between pure silencing and tox score
cor(exp1$tox_score, exp1$pure.silencing)
#correlation between withdrawal and tox score
cor(exp1$tox_score, exp1$withdrawal)

#subset out 0's in tox score to see if the correlation is driven by the 0's
exp1_nonzero_tox <- exp1$tox_score != 0
cor(exp1$tox_score[exp1_nonzero_tox], exp1$Offensive[exp1_nonzero_tox])
cor(exp1$tox_score[exp1_nonzero_tox], exp1_opted_out[exp1_nonzero_tox])
cor(exp1$tox_score[exp1_nonzero_tox], exp1$pure.silencing[exp1_nonzero_tox])
#the correlations are modest at best

#see if dichotomizing creates a better measure
#1 = strictly above the median tox score, 0 = at or below it
exp1$tox_score.dichotomous <- ifelse(exp1$tox_score > median(exp1$tox_score), 1, 0)
#correlation between offensiveness score and dichotomized tox score
cor(exp1$tox_score.dichotomous, exp1$Offensive)
cor(exp1$tox_score.dichotomous, exp1_opted_out)
cor(exp1$tox_score.dichotomous, exp1$pure.silencing)
#NOTE(review): the original had a "subset out 0's" comment here with no code
#under it; no zero-excluded check is run for the dichotomized score
#not great performance

####second study
#pure silencing: opted out and offensive.index above 0
exp2$pure.silencing <- ifelse(exp2$opted_out == "TRUE" & exp2$offensive.index > 0, 1, 0)
#withdrawal: opted out and offensive.index equal to 0
exp2$withdrawal <- ifelse(exp2$opted_out == "TRUE" & exp2$offensive.index == 0, 1, 0)
#correlation between offensiveness scores and tox scores
cor(exp2$tox_score, exp2$offensive.index)
#correlation between discouraging scores and tox score
cor(exp2$tox_score, exp2$discouraging.index)
#correlation between silencing and tox score
cor(exp2$tox_score, exp2$silenced)
#correlation between pure silencing and tox score
cor(exp2$tox_score, exp2$pure.silencing)
#correlation between withdrawal and tox score
#(fixed: this line previously repeated pure.silencing, so the withdrawal
#correlation was never computed)
cor(exp2$tox_score, exp2$withdrawal)
#the correlations are modest at best
#NOTE(review): re-check this conclusion for withdrawal after the fix above

#check if dichotomizing creates a better measure
#1 = strictly above the median tox score, 0 = at or below it
exp2$tox_score.dichotomous <- ifelse(exp2$tox_score > median(exp2$tox_score), 1, 0)
#correlation between offensiveness scores and dichotomized tox scores
#(fixed: the column is offensive.index, as used in the rest of this script;
#the misspelled offensiveness.index made `$` return NULL and cor() fail)
cor(exp2$tox_score.dichotomous, exp2$offensive.index)
#correlation between discouraging scores and dichotomized tox score
cor(exp2$tox_score.dichotomous, exp2$discouraging.index)
#correlation between silencing and dichotomized tox score
cor(exp2$tox_score.dichotomous, exp2$silenced)
#correlation between pure silencing and dichotomized tox score
cor(exp2$tox_score.dichotomous, exp2$pure.silencing)
#the correlations are modest at best
#good for discouraging and silenced, but terrible for offensiveness
#NOTE(review): the offensiveness remark was written before the column-name fix
#above; re-run and confirm it still holds